library(tidyverse)
library(janitor)
library(here)
# Import the community belonging data and tidy its column names.
community_belonging <- here("raw_data/community_belonging.csv") %>%
  read_csv() %>%
  clean_names()
Rows: 43611 Columns: 13── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): FeatureCode, Measurement, Units, Community belonging, Gender, Urban Rural Classification, SIMD quintiles, Type Of Tenure, Household Type, Ethnicity,...
dbl (2): DateCode, Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the green/blue space data and tidy its column names.
green_spaces <- here("raw_data/green_spaces.csv") %>%
  read_csv() %>%
  clean_names()
Rows: 38451 Columns: 13── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): FeatureCode, Measurement, Units, Distance to Nearest Green or Blue Space, Age, Gender, Urban Rural Classification, SIMD quintiles, Type Of Tenure, H...
dbl (2): DateCode, Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the neighbourhood rating data and tidy its column names.
neighbourhood_rating <- here("raw_data/neighbourhood_rating.csv") %>%
  read_csv() %>%
  clean_names()
Rows: 38055 Columns: 13── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): FeatureCode, Measurement, Units, Neighbourhood rating, Gender, Urban Rural Classification, SIMD quintiles, Type Of Tenure, Household Type, Ethnicity...
dbl (2): DateCode, Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print each imported table for a first look at its columns and values.
community_belonging
green_spaces
neighbourhood_rating
# Count the missing values in every column of the community belonging data.
community_belonging %>%
  summarise(across(everything(), function(column) sum(is.na(column))))
There are no NA values
# List the distinct values held in the `units` column.
distinct(community_belonging, units)
There is only 1 value in this column. It can be removed in the cleaning script
# Number of rows left once exact duplicate rows are dropped.
nrow(distinct(community_belonging))
[1] 43611
# Total number of rows, for comparison with the de-duplicated count.
nrow(community_belonging)
[1] 43611
No duplicates
community_belonging %>%
filter(feature_code == "S12000036", year == "2016",
walking_distance_to_nearest_greenspace == "More than 10 minutes")
Error in `filter()`:
ℹ In argument: `year == "2016"`.
Caused by error in `year == "2016"`:
! comparison (==) is possible only for atomic and list types
Backtrace:
1. community_belonging %>% ...
3. dplyr:::filter.data.frame(...)
4. dplyr:::filter_rows(.data, dots, by)
5. dplyr:::filter_eval(...)
7. mask$eval_all_filter(dots, env_filter)
8. dplyr (local) eval()
There appear to be some missing values such as how many people voted “Don’t know” to community belonging where the walking distance is greater than 10 minutes. Is it sensible to assume this is 0?
# Add up the reported percentages for one area (feature code S12000036) in
# 2016 among respondents more than 10 minutes from green space, to check
# whether the categories that are present already account for 100%.
# The year is held in the numeric `date_code` column; the data has no `year`
# column, so filtering on `year` fails instead of selecting rows.
community_belonging %>%
  filter(
    feature_code == "S12000036",
    date_code == 2016,
    walking_distance_to_nearest_greenspace == "More than 10 minutes",
    measurement == "Percent"
  ) %>%
  summarise(sum(value))
Yes, the others add up to 100% so 0% voted “Don’t know”.
# Look at the 2019 rows for female respondents.
community_belonging %>%
  filter(date_code == 2019 & gender == "Female")
The feature codes look like council areas. The name of the area is
going to be more useful than the codes.
# Import the council area code lookup and tidy its column names.
council_areas <- here("raw_data/council_area_codes.csv") %>%
  read_csv() %>%
  clean_names()
Rows: 44 Columns: 14── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): CA, CAName, HSCP, HSCPName, HB, HBName, Country
dbl (7): _id, CADateEnacted, CADateArchived, HSCPDateEnacted, HSCPDateArchived, HBDateEnacted, HBDateArchived
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build a code -> name lookup for joining onto the survey tables.
# The raw lookup repeats some council area codes (presumably one row per
# HSCP / health board version of the same area -- TODO confirm), which makes
# joins on `ca` fan out and duplicate survey rows. Keep a single row per code
# so that each feature code maps to exactly one name.
council_areas_clean <- council_areas %>%
  select(ca, ca_name) %>%
  distinct(ca, .keep_all = TRUE) %>%
  # S92000003 is used for rows covering the whole of Scotland.
  add_row(ca = "S92000003", ca_name = "Scotland")
Some rows appear to be for the whole of Scotland so I have added a row for this code.
# Attach the council area name to every survey row via its feature code.
community_belonging <- left_join(
  community_belonging,
  council_areas_clean,
  by = join_by(feature_code == ca)
)
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
When plotting, the default order of the community belonging categories will not make sense, so this column should be converted to a factor with explicitly ordered levels.
# Turn the belonging category into a factor whose levels run from the weakest
# to the strongest response, so plots show the categories in that order.
community_belonging <- community_belonging %>%
  mutate(
    community_belonging = community_belonging %>%
      factor(
        levels = c(
          "Not at all strongly",
          "Not very strongly",
          "Don't know",
          "Fairly strongly",
          "Very strongly"
        )
      )
  )
Each row appears to represent a combination of a demographic group (e.g. female, or more than 10 minutes from the nearest greenspace), a community belonging category, and a measurement type, with the value giving the percentage of people in that group who rated their community belonging at that category.
# List the walking-distance categories present in the data.
distinct(community_belonging, walking_distance_to_nearest_greenspace)
# Number of rows in each walking-distance category.
community_belonging %>%
  count(walking_distance_to_nearest_greenspace, name = "total")
# Percentage that 105 represents relative to 4107 + 1035 (presumably the row
# counts from the walking-distance table above -- verify against its output).
105 / (4107 + 1035) * 100
Walking distance to nearest greenspace is likely self reported, only 2% of people do not know the answer and there is a reasonable amount of data that do know.
Does this add up to 100%?
# Check whether the 2019 percentages for female respondents in West Lothian
# add up to 100 across the belonging categories.
community_belonging %>%
  filter(
    date_code == 2019 & gender == "Female" & ca_name == "West Lothian"
  ) %>%
  select(measurement:community_belonging) %>%
  # One column per measurement type, so the percentage gets its own column.
  pivot_wider(names_from = "measurement", values_from = "value") %>%
  clean_names() %>%
  summarise(sum(percent))
Yes.
# Bar chart of the 2019 Scotland-wide belonging percentages, with one panel
# per walking-distance category (the "All" aggregate is left out).
community_belonging %>%
  filter(
    walking_distance_to_nearest_greenspace != "All" &
      ca_name == "Scotland" &
      date_code == 2019
  ) %>%
  # One column per measurement type, so the percentage gets its own column.
  pivot_wider(names_from = "measurement", values_from = "value") %>%
  clean_names() %>%
  ggplot(
    aes(
      x = community_belonging,
      y = percent,
      fill = walking_distance_to_nearest_greenspace
    )
  ) +
  geom_col(position = "dodge") +
  facet_wrap(vars(walking_distance_to_nearest_greenspace))
# Total of the 2019 Scotland-wide percentages within each walking-distance
# category (the "All" aggregate is left out).
community_belonging %>%
  filter(
    walking_distance_to_nearest_greenspace != "All" &
      ca_name == "Scotland" &
      date_code == 2019
  ) %>%
  pivot_wider(names_from = "measurement", values_from = "value") %>%
  clean_names() %>%
  group_by(walking_distance_to_nearest_greenspace) %>%
  summarise(total = sum(percent))
It is difficult to quantify the difference.
Perhaps we could use a score for community belonging? (1 to 5, or
-2 to +2)
# List the belonging categories that need a score.
distinct(community_belonging, community_belonging)
# Give every belonging category a numeric score on two scales:
#   belonging_score        1 (weakest) to 5 (strongest)
#   belonging_score_zeroed -2 to 2, centred on "Don't know"
community_belonging_scored <- community_belonging %>%
  mutate(
    belonging_score = case_when(
      community_belonging == "Not at all strongly" ~ 1,
      community_belonging == "Not very strongly" ~ 2,
      community_belonging == "Don't know" ~ 3,
      community_belonging == "Fairly strongly" ~ 4,
      community_belonging == "Very strongly" ~ 5
    ),
    # Shifting the 1-5 score down by 3 gives the centred -2..2 scale.
    belonging_score_zeroed = belonging_score - 3
  )
# Overall belonging score per walking-distance category (Scotland, 2019,
# "All" aggregate left out), plotted on the centred scale.
# Each category's percentage is weighted by its score and the weighted values
# are summed within the group. A sum is used rather than a mean: a mean
# divides by the number of category rows present, so a group with a missing
# category would be inflated relative to the others.
community_belonging_scored %>%
  filter(
    walking_distance_to_nearest_greenspace != "All",
    ca_name == "Scotland",
    date_code == 2019
  ) %>%
  # One column per measurement type, so the percentage gets its own column.
  pivot_wider(names_from = measurement, values_from = value) %>%
  clean_names() %>%
  mutate(
    score = percent * belonging_score,
    score_zero = percent * belonging_score_zeroed
  ) %>%
  group_by(walking_distance_to_nearest_greenspace) %>%
  summarise(
    overall_belonging = sum(score),
    overall_belonging_zero = sum(score_zero)
  ) %>%
  ggplot(aes(walking_distance_to_nearest_greenspace, overall_belonging_zero)) +
  geom_col()
With normal scoring the maximum possible is 100% scoring 5 -> 500
(0:500)
With zeroed score the maximum possible is 100% scoring 2 -> 200
(-200:200)
This might be better on a scale of -1 to 1.
# Overall belonging score per walking-distance category (Scotland, 2019,
# "All" aggregate left out), rescaled to run from -1 to 1.
# Dividing percent * score by 200 (100% at a score of 2) and summing within
# the group gives that range. A mean would divide again by the number of
# category rows present, shrinking the scale and inflating any group with a
# missing category.
community_belonging_scored %>%
  filter(
    walking_distance_to_nearest_greenspace != "All",
    ca_name == "Scotland",
    date_code == 2019
  ) %>%
  # One column per measurement type, so the percentage gets its own column.
  pivot_wider(names_from = measurement, values_from = value) %>%
  clean_names() %>%
  mutate(score_zero = (percent * belonging_score_zeroed) / 200) %>%
  group_by(walking_distance_to_nearest_greenspace) %>%
  summarise(overall_belonging_zero = sum(score_zero)) %>%
  ggplot(aes(walking_distance_to_nearest_greenspace, overall_belonging_zero)) +
  geom_col()
# Scatter of the per-category weighted scores (percent * centred score / 200)
# against walking-distance category, for Scotland in 2019.
community_belonging_scored %>%
  filter(
    walking_distance_to_nearest_greenspace != "All" &
      ca_name == "Scotland" &
      date_code == 2019
  ) %>%
  pivot_wider(names_from = "measurement", values_from = "value") %>%
  clean_names() %>%
  mutate(score_zero = (percent * belonging_score_zeroed) / 200) %>%
  ggplot(aes(x = walking_distance_to_nearest_greenspace, y = score_zero)) +
  geom_point()
App should explore community belonging for each group.
# Print the green space table, then list the age categories it contains.
green_spaces
distinct(green_spaces, age)
This dataset tells us who has access to green or blue spaces within 10 minutes.
# Line chart over time of the Scotland-wide percentage for urban respondents,
# with one line per distance-to-green/blue-space category.
green_spaces %>%
  left_join(council_areas_clean, by = join_by(feature_code == ca)) %>%
  filter(
    ca_name == "Scotland" &
      urban_rural_classification == "Urban" &
      measurement == "Percent"
  ) %>%
  ggplot(
    aes(
      x = date_code,
      y = value,
      colour = distance_to_nearest_green_or_blue_space
    )
  ) +
  geom_line()
# Print the neighbourhood rating table.
neighbourhood_rating
# Row count with exact duplicate rows removed ...
nrow(distinct(neighbourhood_rating))
# ... compared with the total row count.
nrow(neighbourhood_rating)
# List the rating categories present in the data.
distinct(neighbourhood_rating, neighbourhood_rating)